Read training dataset
# Load the ZIP-code digit data as numeric matrices: column 1 is the digit
# label (0-9), the remaining 256 columns are pixel features (see dim() below).
# NOTE(review): hard-coded absolute paths make this non-portable — consider
# relative paths or a configurable data directory.
zip <- as.matrix(read.table("/Users/pawanjeetkaur/Downloads/IDS-575- MachineLearning Core/Assignment_2/zip.train"))
zip_test <- as.matrix(read.table("/Users/pawanjeetkaur/Downloads/IDS-575- MachineLearning Core/Assignment_2/zip.test"))
# Row indices of the digits 2 and 3 in the training set
data_2_3_digits <- which(zip[, 1] %in% c(2, 3))
length(data_2_3_digits)
## [1] 1389
# Row indices of the digits 2 and 3 in the test set
data_2_3_digits_test <- which(zip_test[, 1] %in% c(2, 3))
length(data_2_3_digits_test)
## [1] 364
# Sanity checks on the full training matrix: 7291 rows by 257 columns,
# with the label column covering all digits 0-9.
dim(zip)
## [1] 7291 257
summary(zip[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 4.000 3.903 7.000 9.000
Divide the training and test datasets into independent (X) and dependent (Y) variables for knn
# Restrict to the 2/3 rows and split each set into predictors (all pixel
# columns) and the label (first column, auto-named V1 by data.frame()).
data_full_train <- data.frame(zip[data_2_3_digits,])
zip_X_train <- data_full_train[,-1]
zip_Y_train <- data_full_train[,1]
data_full_test <- data.frame(zip_test[data_2_3_digits_test,])
zip_X_test <- data_full_test[,-1]
zip_Y_test <- data_full_test[,1]
Run multiple regression
# Treat the two-class problem as regression: fit OLS with the digit label
# (V1, values 2 or 3) as a numeric response on all pixel predictors.
# Predictions are rounded to the nearest integer later to recover a class.
lm_reg <- lm(V1 ~ . , data = data_full_train)
#summary(lm_reg)
Run prediction on training data
# Predict on the training data and round to the nearest digit label.
# round() is vectorized, so wrapping predict() in sapply() is unnecessary.
predict_output_train <- round(predict(lm_reg, newdata = data_full_train))
conf_matrix_train <- table(predict_output_train, zip_Y_train)
# Accuracy of model on training data
accuracy_train <- sum(diag(conf_matrix_train)) / sum(conf_matrix_train)
accuracy_train
## [1] 0.9942405
error_train <- 1 - accuracy_train
error_train
## [1] 0.005759539
Run prediction on test data
# Predict on the held-out test data and round to the nearest digit label
# (round() is vectorized; no sapply() needed).
predict_output_test <- round(predict(lm_reg, newdata = data_full_test))
conf_matrix_test <- table(predict_output_test, zip_Y_test)
# Accuracy of model on test data
accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
accuracy_test
## [1] 0.9587912
error_test <- 1 - accuracy_test
error_test
## [1] 0.04120879
knn for k = 1
library(class)
# k-NN with k = 1: classify the test set using the training set as reference.
knn_1 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=1, use.all = TRUE)
conf_matrix_knn_1_test <- table(knn_1, zip_Y_test)
accuracy_knn_1_test <- sum(diag(conf_matrix_knn_1_test))/ sum(conf_matrix_knn_1_test)
accuracy_knn_1_test
## [1] 0.9752747
error_knn_1_test <- 1 - accuracy_knn_1_test
error_knn_1_test
## [1] 0.02472527
# Training error for k = 1: each training point is its own nearest neighbour,
# so the training set is classified perfectly (error 0) by construction.
knn_1_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=1, use.all = TRUE)
conf_matrix_knn_1_train <- table(knn_1_train, zip_Y_train)
accuracy_knn_train_1 <- sum(diag(conf_matrix_knn_1_train))/ sum(conf_matrix_knn_1_train)
accuracy_knn_train_1
## [1] 1
error_knn_1_train <- 1 - accuracy_knn_train_1
error_knn_1_train
## [1] 0
knn for k = 3
# k-NN with k = 3: test-set error.
knn_3 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=3, use.all = TRUE)
conf_matrix_knn_3_test <- table(knn_3, zip_Y_test)
accuracy_knn_3_test <- sum(diag(conf_matrix_knn_3_test))/ sum(conf_matrix_knn_3_test)
accuracy_knn_3_test
## [1] 0.9697802
error_knn_3_test <- 1 - accuracy_knn_3_test
error_knn_3_test
## [1] 0.03021978
# k-NN with k = 3: training-set error.
knn_3_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=3, use.all = TRUE)
conf_matrix_knn_3_train <- table(knn_3_train, zip_Y_train)
accuracy_knn_train_3 <- sum(diag(conf_matrix_knn_3_train))/ sum(conf_matrix_knn_3_train)
accuracy_knn_train_3
## [1] 0.9949604
error_knn_3_train <- 1 - accuracy_knn_train_3
error_knn_3_train
## [1] 0.005039597
knn for k = 5
# k-NN with k = 5: test-set error.
knn_5 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=5, use.all = TRUE)
conf_matrix_knn_5_test <- table(knn_5, zip_Y_test)
accuracy_knn_5_test <- sum(diag(conf_matrix_knn_5_test))/ sum(conf_matrix_knn_5_test)
accuracy_knn_5_test
## [1] 0.9697802
error_knn_5_test <- 1 - accuracy_knn_5_test
error_knn_5_test
## [1] 0.03021978
# k-NN with k = 5: training-set error.
knn_5_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=5, use.all = TRUE)
conf_matrix_knn_5_train <- table(knn_5_train, zip_Y_train)
accuracy_knn_train_5 <- sum(diag(conf_matrix_knn_5_train))/ sum(conf_matrix_knn_5_train)
accuracy_knn_train_5
## [1] 0.9942405
error_knn_5_train <- 1 - accuracy_knn_train_5
error_knn_5_train
## [1] 0.005759539
knn for k = 7
# k-NN with k = 7: test-set error.
knn_7 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=7, use.all = TRUE)
conf_matrix_knn_7_test <- table(knn_7, zip_Y_test)
accuracy_knn_7_test <- sum(diag(conf_matrix_knn_7_test))/ sum(conf_matrix_knn_7_test)
accuracy_knn_7_test
## [1] 0.967033
error_knn_7_test <- 1 - accuracy_knn_7_test
error_knn_7_test
## [1] 0.03296703
# k-NN with k = 7: training-set error.
knn_7_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=7, use.all = TRUE)
conf_matrix_knn_7_train <- table(knn_7_train, zip_Y_train)
accuracy_knn_train_7 <- sum(diag(conf_matrix_knn_7_train))/ sum(conf_matrix_knn_7_train)
accuracy_knn_train_7
## [1] 0.9935205
error_knn_7_train <- 1 - accuracy_knn_train_7
error_knn_7_train
## [1] 0.006479482
knn for k = 15
# k-NN with k = 15: test-set error.
knn_15 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=15, use.all = TRUE)
conf_matrix_knn_15_test <- table(knn_15, zip_Y_test)
accuracy_knn_15_test <- sum(diag(conf_matrix_knn_15_test))/ sum(conf_matrix_knn_15_test)
accuracy_knn_15_test
## [1] 0.9615385
error_knn_15_test <- 1 - accuracy_knn_15_test
error_knn_15_test
## [1] 0.03846154
# k-NN with k = 15: training-set error.
knn_15_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=15, use.all = TRUE)
conf_matrix_knn_15_train <- table(knn_15_train, zip_Y_train)
accuracy_knn_train_15 <- sum(diag(conf_matrix_knn_15_train))/ sum(conf_matrix_knn_15_train)
accuracy_knn_train_15
## [1] 0.9906407
error_knn_15_train <- 1 - accuracy_knn_train_15
error_knn_15_train
## [1] 0.009359251
Plot Errors
# Plot k-NN train/test error rates across k against the linear-regression
# baseline errors (horizontal dashed lines).
x_axis <- c(1, 3, 5, 7, 15)
y_axis_train <- c(error_knn_1_train, error_knn_3_train, error_knn_5_train,
                  error_knn_7_train, error_knn_15_train)
y_axis_test <- c(error_knn_1_test, error_knn_3_test, error_knn_5_test,
                 error_knn_7_test, error_knn_15_test)
plot_data <- data.frame(x_axis, y_axis_test, y_axis_train)
library(ggplot2)
# Use a distinct name for the theme object so it does not shadow ggplot2::theme().
plot_theme <- theme(panel.grid = element_blank(), panel.background = element_blank(),
                    axis.line = element_line(colour = "black"))
# Map the second series and the hline intercepts inside aes() so they appear
# in the legend; note the original had a stray trailing comma in geom_hline().
ggplot(plot_data, aes(x = x_axis, y = y_axis_test)) +
  geom_line(aes(col = "test_knn")) +
  xlim(c(0, 16)) + ylim(c(0, 0.05)) +
  geom_line(aes(y = y_axis_train, col = "train_knn")) +
  geom_hline(size = 1.5, linetype = "dashed", aes(yintercept = error_test, col = "lm_test")) +
  geom_hline(size = 1.5, linetype = "dashed", aes(yintercept = error_train, col = "lm_train")) +
  scale_color_manual(name = "Labels",
                     values = c(lm_test = "aquamarine4", lm_train = "aquamarine2",
                                test_knn = "purple", train_knn = "brown")) +
  ggtitle("Error rate for linear regression vs knn") +
  plot_theme + xlab("K values") + ylab("Error rates")
Read dataset
library(ISLR)
# Auto dataset: 392 cars, 9 variables (name is a factor; the rest numeric).
data <- data.frame(Auto)
dim(data)
## [1] 392 9
str(data)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
# Scatterplot matrix of all variable pairs (lower triangle only).
pairs(data, col = "aquamarine4", upper.panel = NULL, main = "Scatterplot for Auto Dataset")
# Correlation matrix of the numeric columns (drop name, column 9 — a factor).
cor(data[, -9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
# Multiple regression of mpg on all other numeric predictors (name excluded).
lm_fit <- lm(mpg ~ ., data = data[,-9])
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ ., data = data[, -9])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
# First row of the correlation matrix: each predictor's correlation with mpg.
cor(data[, -9])[1,]
## mpg cylinders displacement horsepower weight
## 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## acceleration year origin
## 0.4233285 0.5805410 0.5652088
Yes, there is a relationship between the predictors and the response. There is a strong negative correlation between mpg and weight, displacement, cylinders and horsepower. The remaining fields have a positive correlation with the response variable mpg.
The predictors displacement, weight, year and origin appear to have a statistically significant relationship to the response, based on the above regression output and p-values.
The coefficient for year is 0.75, i.e., with every one-unit increase in year, the response variable mpg increases by about 0.75 miles per gallon (not 75 percent), holding the other predictors fixed; year has a positive relationship with the response.
# Standard lm diagnostic plots: residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage.
plot(lm_fit , col = "aquamarine4")
The Residuals vs Fitted plot displays some non-linearity in the data. The Normal Q-Q plot suggests that the residuals are roughly normal. The Scale-Location curve indicates that the residuals have a random spread along the range of the predictors, hence we can say the residuals are roughly homoscedastic. The Residuals vs Leverage curve shows some mild outliers and a high-leverage point (point 14).
# Interaction model: horsepower * displacement expands to both main effects
# plus the horsepower:displacement interaction term.
lm_fit_int <- lm(mpg ~ horsepower * displacement, data = data[,-9])
summary(lm_fit_int)
##
## Call:
## lm(formula = mpg ~ horsepower * displacement, data = data[, -9])
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9391 -2.3373 -0.5816 2.1698 17.5771
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.305e+01 1.526e+00 34.77 <2e-16 ***
## horsepower -2.343e-01 1.959e-02 -11.96 <2e-16 ***
## displacement -9.805e-02 6.682e-03 -14.67 <2e-16 ***
## horsepower:displacement 5.828e-04 5.193e-05 11.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.944 on 388 degrees of freedom
## Multiple R-squared: 0.7466, Adjusted R-squared: 0.7446
## F-statistic: 381 on 3 and 388 DF, p-value: < 2.2e-16
# Interaction term only: horsepower:displacement without the main effects
# (contrast with the * model above — note the much lower R-squared).
lm_fit_int_col <- lm(mpg ~ horsepower:displacement, data = data[,-9])
summary(lm_fit_int_col)
##
## Call:
## lm(formula = mpg ~ horsepower:displacement, data = data[, -9])
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1917 -3.9460 -0.9919 3.0108 18.2170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.989e+01 3.901e-01 76.62 <2e-16 ***
## horsepower:displacement -2.694e-04 1.209e-05 -22.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.184 on 390 degrees of freedom
## Multiple R-squared: 0.56, Adjusted R-squared: 0.5589
## F-statistic: 496.4 on 1 and 390 DF, p-value: < 2.2e-16
# Full interaction model: (.*.) expands to all main effects plus every
# two-way interaction between the predictors.
lm_fit_all <- lm(mpg ~ (.*.), data = data[,-9])
summary(lm_fit_all)
##
## Call:
## lm(formula = mpg ~ (. * .), data = data[, -9])
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
A few statistically significant interactions (at the 0.05 level) are displacement:year, acceleration:year and acceleration:origin.
# Model with assorted variable transformations: squares, square roots and a
# log, to probe non-linear relationships with mpg.
lm_fit_f_part <- lm(mpg~ I(cylinders^2) + sqrt(displacement) + sqrt(horsepower) + log(weight) +
I(acceleration^2) + year + origin , data = data[, -9])
summary(lm_fit_f_part)
##
## Call:
## lm(formula = mpg ~ I(cylinders^2) + sqrt(displacement) + sqrt(horsepower) +
## log(weight) + I(acceleration^2) + year + origin, data = data[,
## -9])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1069 -1.9532 -0.0089 1.7099 12.8016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 123.571135 12.217980 10.114 < 2e-16 ***
## I(cylinders^2) 0.016429 0.025326 0.649 0.51693
## sqrt(displacement) 0.199035 0.207529 0.959 0.33813
## sqrt(horsepower) -0.328482 0.280941 -1.169 0.24304
## log(weight) -20.168541 1.913320 -10.541 < 2e-16 ***
## I(acceleration^2) 0.003127 0.002837 1.102 0.27101
## year 0.766080 0.047939 15.980 < 2e-16 ***
## origin 0.952517 0.273740 3.480 0.00056 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.106 on 384 degrees of freedom
## Multiple R-squared: 0.8445, Adjusted R-squared: 0.8416
## F-statistic: 297.9 on 7 and 384 DF, p-value: < 2.2e-16
Overall the model appears statistically significant. Among the transformations, log(weight) provides significant results.
library(MASS)
# Simple linear regression of Boston crim on a single predictor vector:
# plot the scatter with the fitted line, print the coefficient table, and
# return the slope estimate (row 2, column 1 of the coefficient matrix).
run_lm <- function(x) {
  lm_run <- lm(crim ~ x, data = Boston)
  with(Boston, plot(x, crim, col = "aquamarine4"))
  abline(lm_run)
  print(summary(lm_run)$coefficients)
  summary(lm_run)$coefficients[2, 1]
}
# One univariate regression per predictor (all columns except crim itself).
# vapply() rather than sapply() guarantees a numeric vector of slopes.
df_output <- as.data.frame(colnames(Boston[, -1]))
df_output <- cbind(df_output, vapply(Boston[, -1], run_lm, numeric(1)))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.45369376 0.4172178 10.674746 4.037668e-24
## x -0.07393498 0.0160946 -4.593776 5.506472e-06
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.0637426 0.66722830 -3.093008 2.091266e-03
## x 0.5097763 0.05102433 9.990848 1.450349e-21
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.744447 0.3961111 9.453021 1.239505e-19
## x -1.892777 1.5061155 -1.256727 2.094345e-01
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.71988 1.699479 -8.072992 5.076814e-15
## x 31.24853 2.999190 10.418989 3.751739e-23
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.481804 3.3644742 6.087669 2.272000e-09
## x -2.684051 0.5320411 -5.044819 6.346703e-07
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.7779063 0.94398472 -4.002084 7.221718e-05
## x 0.1077862 0.01273644 8.462825 2.854869e-16
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.499262 0.7303972 13.005611 1.502748e-33
## x -1.550902 0.1683300 -9.213458 8.519949e-19
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.2871594 0.44347583 -5.157349 3.605846e-07
## x 0.6179109 0.03433182 17.998199 2.693844e-56
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.52836909 0.815809392 -10.45387 2.773600e-23
## x 0.02974225 0.001847415 16.09939 2.357127e-47
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.646933 3.1472718 -5.607057 3.395255e-08
## x 1.151983 0.1693736 6.801430 2.942922e-11
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.55352922 1.425902755 11.609157 8.922239e-28
## x -0.03627964 0.003873154 -9.366951 2.487274e-19
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.3305381 0.69375829 -4.800718 2.087022e-06
## x 0.5488048 0.04776097 11.490654 2.654277e-27
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.7965358 0.93418916 12.62757 5.934119e-32
## x -0.3631599 0.03839017 -9.45971 1.173987e-19
All the predictors except chas are statistically significant
# Multiple regression of crim on all other Boston predictors at once.
lm_multiple <- lm(crim ~ . , data = Boston)
summary(lm_multiple)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.924 -2.120 -0.353 1.019 75.051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.033228 7.234903 2.354 0.018949 *
## zn 0.044855 0.018734 2.394 0.017025 *
## indus -0.063855 0.083407 -0.766 0.444294
## chas -0.749134 1.180147 -0.635 0.525867
## nox -10.313535 5.275536 -1.955 0.051152 .
## rm 0.430131 0.612830 0.702 0.483089
## age 0.001452 0.017925 0.081 0.935488
## dis -0.987176 0.281817 -3.503 0.000502 ***
## rad 0.588209 0.088049 6.680 6.46e-11 ***
## tax -0.003780 0.005156 -0.733 0.463793
## ptratio -0.271081 0.186450 -1.454 0.146611
## black -0.007538 0.003673 -2.052 0.040702 *
## lstat 0.126211 0.075725 1.667 0.096208 .
## medv -0.198887 0.060516 -3.287 0.001087 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4396
## F-statistic: 31.47 on 13 and 492 DF, p-value: < 2.2e-16
# Collect the multiple-regression slopes (drop the intercept with [-1, 1])
# next to the univariate slopes gathered earlier, for a side-by-side comparison.
multiple_reg_coeff <- summary(lm_multiple)$coefficients[-1,1]
df_out <- cbind(df_output,multiple_reg_coeff)
colnames(df_out) <- c("predictors","single_reg_coeff" , "multiple_reg_coeff")
# Diagnostic plots for the multiple regression.
plot(lm_multiple, col= "aquamarine4")
Based on the above regression results, we can reject H0 for rad, zn, dis, black and medv
library(ggplot2)
# Scatter of each predictor's univariate slope against its slope in the
# multiple regression. Fixes the axis-label typo "Mutlivariate".
ggplot(data = df_out, aes(x = single_reg_coeff, y = multiple_reg_coeff)) +
  geom_point(aes(color = predictors)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black")) +
  xlab("Univariate coefficients") + ylab("Multivariate coefficients") +
  ggtitle("Univariate VS Multivariate coefficient")
In the univariate regressions chas was not statistically significant but the others were, whereas in the multivariate regression only zn, rad, dis, black and medv are significant
# Cubic polynomial regression of crim on a single predictor.
run_lm <- function(x) {
  lm_run_1 <- lm(crim ~ poly(x, 3), data = Boston)
  summary(lm_run_1)
}
# Fit one cubic model per predictor. Exclude crim itself (column 1) as well
# as chas (column 4): regressing crim on crim is what produced the
# "essentially perfect fit" warning and the degenerate $crim output below,
# and the binary chas lacks enough unique values for a degree-3 poly().
# lapply() also avoids apply()'s coercion of the data frame to a matrix.
lapply(Boston[, -c(1, 4)], run_lm)
## $crim
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.318e-14 -3.980e-16 4.500e-17 1.270e-16 5.602e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614e+00 1.444e-16 2.502e+16 < 2e-16 ***
## poly(x, 3)1 1.933e+02 3.248e-15 5.951e+16 < 2e-16 ***
## poly(x, 3)2 1.574e-14 3.248e-15 4.845e+00 1.69e-06 ***
## poly(x, 3)3 -1.546e-14 3.248e-15 -4.758e+00 2.56e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.248e-15 on 502 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.18e+33 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $zn
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.821 -4.614 -1.294 0.473 84.130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3722 9.709 < 2e-16 ***
## poly(x, 3)1 -38.7498 8.3722 -4.628 4.7e-06 ***
## poly(x, 3)2 23.9398 8.3722 2.859 0.00442 **
## poly(x, 3)3 -10.0719 8.3722 -1.203 0.22954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.372 on 502 degrees of freedom
## Multiple R-squared: 0.05824, Adjusted R-squared: 0.05261
## F-statistic: 10.35 on 3 and 502 DF, p-value: 1.281e-06
##
##
## $indus
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.278 -2.514 0.054 0.764 79.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.330 10.950 < 2e-16 ***
## poly(x, 3)1 78.591 7.423 10.587 < 2e-16 ***
## poly(x, 3)2 -24.395 7.423 -3.286 0.00109 **
## poly(x, 3)3 -54.130 7.423 -7.292 1.2e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.423 on 502 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2552
## F-statistic: 58.69 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $nox
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.110 -2.068 -0.255 0.739 78.302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3216 11.237 < 2e-16 ***
## poly(x, 3)1 81.3720 7.2336 11.249 < 2e-16 ***
## poly(x, 3)2 -28.8286 7.2336 -3.985 7.74e-05 ***
## poly(x, 3)3 -60.3619 7.2336 -8.345 6.96e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.234 on 502 degrees of freedom
## Multiple R-squared: 0.297, Adjusted R-squared: 0.2928
## F-statistic: 70.69 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $rm
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.485 -3.468 -2.221 -0.015 87.219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3703 9.758 < 2e-16 ***
## poly(x, 3)1 -42.3794 8.3297 -5.088 5.13e-07 ***
## poly(x, 3)2 26.5768 8.3297 3.191 0.00151 **
## poly(x, 3)3 -5.5103 8.3297 -0.662 0.50858
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.33 on 502 degrees of freedom
## Multiple R-squared: 0.06779, Adjusted R-squared: 0.06222
## F-statistic: 12.17 on 3 and 502 DF, p-value: 1.067e-07
##
##
## $age
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.762 -2.673 -0.516 0.019 82.842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3485 10.368 < 2e-16 ***
## poly(x, 3)1 68.1820 7.8397 8.697 < 2e-16 ***
## poly(x, 3)2 37.4845 7.8397 4.781 2.29e-06 ***
## poly(x, 3)3 21.3532 7.8397 2.724 0.00668 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.84 on 502 degrees of freedom
## Multiple R-squared: 0.1742, Adjusted R-squared: 0.1693
## F-statistic: 35.31 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $dis
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.757 -2.588 0.031 1.267 76.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3259 11.087 < 2e-16 ***
## poly(x, 3)1 -73.3886 7.3315 -10.010 < 2e-16 ***
## poly(x, 3)2 56.3730 7.3315 7.689 7.87e-14 ***
## poly(x, 3)3 -42.6219 7.3315 -5.814 1.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.331 on 502 degrees of freedom
## Multiple R-squared: 0.2778, Adjusted R-squared: 0.2735
## F-statistic: 64.37 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $rad
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.381 -0.412 -0.269 0.179 76.217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.2971 12.164 < 2e-16 ***
## poly(x, 3)1 120.9074 6.6824 18.093 < 2e-16 ***
## poly(x, 3)2 17.4923 6.6824 2.618 0.00912 **
## poly(x, 3)3 4.6985 6.6824 0.703 0.48231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.682 on 502 degrees of freedom
## Multiple R-squared: 0.4, Adjusted R-squared: 0.3965
## F-statistic: 111.6 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $tax
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.273 -1.389 0.046 0.536 76.950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3047 11.860 < 2e-16 ***
## poly(x, 3)1 112.6458 6.8537 16.436 < 2e-16 ***
## poly(x, 3)2 32.0873 6.8537 4.682 3.67e-06 ***
## poly(x, 3)3 -7.9968 6.8537 -1.167 0.244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.854 on 502 degrees of freedom
## Multiple R-squared: 0.3689, Adjusted R-squared: 0.3651
## F-statistic: 97.8 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $ptratio
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.833 -4.146 -1.655 1.408 82.697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.361 10.008 < 2e-16 ***
## poly(x, 3)1 56.045 8.122 6.901 1.57e-11 ***
## poly(x, 3)2 24.775 8.122 3.050 0.00241 **
## poly(x, 3)3 -22.280 8.122 -2.743 0.00630 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.122 on 502 degrees of freedom
## Multiple R-squared: 0.1138, Adjusted R-squared: 0.1085
## F-statistic: 21.48 on 3 and 502 DF, p-value: 4.171e-13
##
##
## $black
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.096 -2.343 -2.128 -1.439 86.790
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3536 10.218 <2e-16 ***
## poly(x, 3)1 -74.4312 7.9546 -9.357 <2e-16 ***
## poly(x, 3)2 5.9264 7.9546 0.745 0.457
## poly(x, 3)3 -4.8346 7.9546 -0.608 0.544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.955 on 502 degrees of freedom
## Multiple R-squared: 0.1498, Adjusted R-squared: 0.1448
## F-statistic: 29.49 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $lstat
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.234 -2.151 -0.486 0.066 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3392 10.654 <2e-16 ***
## poly(x, 3)1 88.0697 7.6294 11.543 <2e-16 ***
## poly(x, 3)2 15.8882 7.6294 2.082 0.0378 *
## poly(x, 3)3 -11.5740 7.6294 -1.517 0.1299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.629 on 502 degrees of freedom
## Multiple R-squared: 0.2179, Adjusted R-squared: 0.2133
## F-statistic: 46.63 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## $medv
##
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.427 -1.976 -0.437 0.439 73.655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.292 12.374 < 2e-16 ***
## poly(x, 3)1 -75.058 6.569 -11.426 < 2e-16 ***
## poly(x, 3)2 88.086 6.569 13.409 < 2e-16 ***
## poly(x, 3)3 -48.033 6.569 -7.312 1.05e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.569 on 502 degrees of freedom
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.4167
## F-statistic: 121.3 on 3 and 502 DF, p-value: < 2.2e-16
Based on the output, for predictors like zn, rm, rad, tax and lstat the cubic term is not statistically significant, although the linear and quadratic terms are. black is significant only for the linear term, whereas indus, nox, age, dis, ptratio and medv are significant up to the third degree of the polynomial.